# Our Github link is: https://github.com/DavidChen25/Covid-19-Sentiment-Analysis
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from collections import Counter
import jieba
import matplotlib.font_manager as fm
from PIL import Image
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
import numpy as np
from termcolor import colored
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from matplotlib import pyplot as plt
from matplotlib import ticker
import seaborn as sns
import plotly.express as px
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package wordnet to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package vader_lexicon to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package vader_lexicon is already up-to-date! [nltk_data] Downloading package punkt to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date!
# Load the train/test tweet CSVs (latin1 tolerates the non-UTF8 bytes in the
# data), normalise the column names, and keep only the tweet text + label.
data_train = pd.read_csv('Corona_NLP_train.csv', encoding='latin1')
data_test = pd.read_csv('Corona_NLP_test.csv', encoding='latin1')
column_names = ['ID', 'ScreenName', 'Location', 'Date', 'Tweet', 'Sentiment']
unused_columns = ['ID', 'Date', 'ScreenName', 'Location']
for frame in (data_train, data_test):
    frame.columns = column_names
    frame.drop(unused_columns, axis=1, inplace=True)
for frame in (data_train, data_test):
    print(frame.dropna())
Tweet Sentiment
0 @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i... Neutral
1 advice Talk to your neighbours family to excha... Positive
2 Coronavirus Australia: Woolworths to give elde... Positive
3 My food stock is not the only one which is emp... Positive
4 Me, ready to go at supermarket during the #COV... Extremely Negative
... ... ...
41152 Airline pilots offering to stock supermarket s... Neutral
41153 Response to complaint not provided citing COVI... Extremely Negative
41154 You know it??s getting tough when @KameronWild... Positive
41155 Is it wrong that the smell of hand sanitizer i... Neutral
41156 @TartiiCat Well new/used Rift S are going for ... Negative
[41157 rows x 2 columns]
Tweet Sentiment
0 TRENDING: New Yorkers encounter empty supermar... Extremely Negative
1 When I couldn't find hand sanitizer at Fred Me... Positive
2 Find out how you can protect yourself and love... Extremely Positive
3 #Panic buying hits #NewYork City as anxious sh... Negative
4 #toiletpaper #dunnypaper #coronavirus #coronav... Neutral
... ... ...
3793 Meanwhile In A Supermarket in Israel -- People... Positive
3794 Did you panic buy a lot of non-perishable item... Negative
3795 Asst Prof of Economics @cconces was on @NBCPhi... Neutral
3796 Gov need to do somethings instead of biar je r... Extremely Negative
3797 I and @ForestandPaper members are committed to... Extremely Positive
[3798 rows x 2 columns]
# Remove URLs. NOTE: the original pattern r'https\S+' only matched https
# links; https? also strips plain http:// URLs.
remove_url = lambda x: re.sub(r'https?\S+', '', str(x))
data_train['Tweet'] = data_train['Tweet'].apply(remove_url)
# Convert all tweets to lowercase.
to_lower = lambda x: x.lower()
data_train['Tweet'] = data_train['Tweet'].apply(to_lower)
# Remove punctuation in a single translate pass.
remove_puncts = lambda x: x.translate(str.maketrans('', '', string.punctuation))
data_train['Tweet'] = data_train['Tweet'].apply(remove_puncts)
# Stop-word list. NLTK corpus fileids are lowercase: 'English' only worked
# because Windows filesystems are case-insensitive; 'english' is portable.
more_words = ['covid', 'covid19']
stop_words = set(stopwords.words('english'))
stop_words.update(more_words)
# Function to process tweets
def clean_tweet(data, wordNetLemmatizer):
    """Normalise the 'Tweet' column in place: strip @mentions, URLs and
    non-letter characters, drop stray single characters and stop words,
    and lemmatise every remaining token.

    Parameters
    ----------
    data : pandas.DataFrame with a string 'Tweet' column (mutated in place).
    wordNetLemmatizer : nltk.stem.WordNetLemmatizer instance.

    Returns
    -------
    The same DataFrame, 'Tweet' rewritten as a cleaned, space-joined string.
    """
    # Raw strings + explicit regex=True: recent pandas no longer treats
    # str.replace patterns as regexes by default (the no-op self-assignment
    # that used to precede this was removed).
    data['Tweet'] = data['Tweet'].str.replace(r"@[\w]*", "", regex=True)
    data['Tweet'] = data['Tweet'].str.replace(r"[^a-zA-Z' ]", "", regex=True)
    data['Tweet'] = data['Tweet'].replace(re.compile(r"((www\.[^\s]+)|(https?://[^\s]+))"), "")
    # Drop isolated single characters left behind by the substitutions.
    data['Tweet'] = data['Tweet'].replace(re.compile(r"(^| ).( |$)"), " ")
    data['Tweet'] = data['Tweet'].str.split()
    data['Tweet'] = data['Tweet'].apply(lambda tweet: [word for word in tweet if word not in stop_words])
    data['Tweet'] = data['Tweet'].apply(lambda tweet: [wordNetLemmatizer.lemmatize(word) for word in tweet])
    data['Tweet'] = data['Tweet'].apply(lambda tweet: ' '.join(tweet))
    return data
# One shared lemmatiser serves both splits.
wordNetLemmatizer = WordNetLemmatizer()
# Clean each split and persist the cleaned text beside the raw CSVs.
train_data = clean_tweet(data_train, wordNetLemmatizer)
test_data = clean_tweet(data_test, wordNetLemmatizer)
train_data.to_csv('clean_train.csv', index=False)
test_data.to_csv('clean_test.csv', index=False)
# Flatten every cleaned tweet into one token stream and chart the
# 50 most frequent words.
words_list = []
for line in train_data['Tweet']:
    words_list.extend(line.split())
word_counts = Counter(words_list).most_common(50)
words_df = pd.DataFrame(word_counts, columns=['word', 'frq'])
fig = px.bar(words_df, x='word', y='frq', title='Most common words')
fig.update_xaxes(tickangle=-45)
fig.show()
from textblob import TextBlob
# TextBlob sentiment scores: polarity in [-1, 1], subjectivity in [0, 1].
train_data['polarity'] = train_data['Tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
train_data['subjectivity'] = train_data['Tweet'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
# Histogram of each score over all tweets, one figure per column.
for column, axis_label in (('polarity', 'Polarity Score'),
                           ('subjectivity', 'Subjectivity Score')):
    fig = plt.figure(figsize=(8, 6))
    train_data[column].hist()
    plt.xlabel(axis_label, fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
# Sentiment count
# Bar chart of how many tweets carry each of the five sentiment labels.
sns.set_style('darkgrid')
plt.figure(figsize = (8, 8))
# NOTE(review): these labels are set *before* countplot draws; countplot may
# relabel the axes from the data — confirm the final labels in the figure.
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
sns.countplot(x = train_data['Sentiment'] , palette = 'viridis')
plt.show()
from wordcloud import WordCloud

def _show_wordcloud(text):
    """Render one 800x500 white-background word cloud for *text* and show it.

    Parameters
    ----------
    text : str — all tweets of interest joined into a single string.
    """
    cloud = WordCloud(width=800, height=500, random_state=21, min_font_size=10,
                      max_font_size=110, background_color="white").generate(text)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis('off')
    plt.tight_layout(pad=0)
    plt.show()

# The four cloud sections were identical copies of the same plotting code;
# they now share _show_wordcloud. One cloud over every tweet, then one per
# selected sentiment class.
words = ' '.join([text for text in train_data['Tweet']])
_show_wordcloud(words)
ExPositive_words = ' '.join([text for text in train_data['Tweet'][train_data['Sentiment'] == 'Extremely Positive']])
_show_wordcloud(ExPositive_words)
ExNegative_words = ' '.join([text for text in train_data['Tweet'][train_data['Sentiment'] == 'Extremely Negative']])
_show_wordcloud(ExNegative_words)
Neutral_words = ' '.join([text for text in train_data['Tweet'][train_data['Sentiment'] == 'Neutral']])
_show_wordcloud(Neutral_words)
# Tf-IDF (Term Frequency - Inverse Document Frequency) features.
# min_df=5 drops very rare tokens; vocabulary capped at 1000 terms.
tfidfVectorizer = TfidfVectorizer(min_df=5, max_features=1000)
# .astype(str) coerces any stray non-string cells once, replacing the
# per-cell np.str_ wrapper; fit + transform on the same data collapses
# into a single fit_transform.
train_tweet_vector = tfidfVectorizer.fit_transform(train_data['Tweet'].astype(str))
test_tweet_vector = tfidfVectorizer.transform(test_data['Tweet'].astype(str))
# Part-of-speech tagging. BUG FIX: the original ran sent_tokenize over
# str(train_data['Tweet']) — i.e. over the *repr* of the Series, so it
# tagged index numbers and 'Name: Tweet, dtype: object' instead of tweets
# (visible in the original output). Tag each tweet's own tokens instead;
# a small sample keeps the printed output readable.
for tweet in train_data['Tweet'].head(10):
    wordsList = nltk.word_tokenize(tweet)
    wordsList = [w for w in wordsList if w not in stop_words]
    pos_tag = nltk.pos_tag(wordsList)
    print(pos_tag)
[('0', 'CD'), ('menyrbie', 'NN'), ('philgahan', 'NN'), ('chrisitv', 'VBD'), ('1', 'CD'), ('advice', 'NN'), ('talk', 'NN'), ('neighbour', 'IN'), ('family', 'NN'), ('exchange', 'NN'), ('phone', 'NN'), ('nu', 'NN'), ('...', ':'), ('2', 'CD'), ('coronavirus', 'NN'), ('australia', 'NNS'), ('woolworth', 'VBP'), ('give', 'VB'), ('elderly', 'JJ'), ('...', ':'), ('3', 'CD'), ('food', 'NN'), ('stock', 'NN'), ('one', 'CD'), ('emptyplease', 'NN'), ('dont', 'NN'), ('panic', 'JJ'), ('enough', 'RB'), ('f', 'NN'), ('...', ':'), ('4', 'CD'), ('ready', 'JJ'), ('go', 'VBP'), ('supermarket', 'JJ'), ('outbreaknot', 'NN'), ('im', 'NN'), ('paranoid', 'NN'), ('f', 'NN'), ('...', ':'), ('...', ':'), ('41152', 'CD'), ('airline', 'NN'), ('pilot', 'NN'), ('offering', 'VBG'), ('stock', 'NN'), ('supermarket', 'NN'), ('shelf', 'NN'), ('...', ':'), ('41153', 'CD'), ('response', 'NN'), ('complaint', 'NN'), ('provided', 'VBD'), ('citing', 'VBG'), ('related', 'JJ'), ('del', 'NN'), ('...', ':'), ('41154', 'CD'), ('know', 'VBP'), ('getting', 'VBG'), ('tough', 'JJ'), ('kameronwilds', 'NNS'), ('rationing', 'VBG'), ('toil', 'NN'), ('...', ':'), ('41155', 'CD'), ('wrong', 'JJ'), ('smell', 'NN'), ('hand', 'NN'), ('sanitizer', 'NN'), ('starting', 'VBG'), ('turn', 'NN'), ('oncor', 'NN'), ('...', ':'), ('41156', 'CD'), ('tartiicat', 'NN'), ('well', 'RB'), ('newused', 'JJ'), ('rift', 'NN'), ('going', 'VBG'), ('amazon', 'JJ'), ('rn', 'NN'), ('al', 'NN'), ('...', ':'), ('Name', 'NN'), (':', ':'), ('Tweet', 'NN'), (',', ','), ('Length', 'NNP'), (':', ':'), ('41157', 'CD'), (',', ','), ('dtype', 'NN'), (':', ':'), ('object', 'NN')]
# N-grams
# The following part of code is from Ayush Pareek's Github: "https://github.com/ayushoriginal/Sentiment-Analysis-Twitter"
# I generated the unigram + negation-handling features but do not know how to fit the model. The incomplete
# feature-building code is shown in the last comment.
import nltk
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk.metrics import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
def unigrams(tweets, add_negtn_feat):
    """Build unigram presence features (optionally with negation-distance
    features) for a list of tokenised tweets.

    Parameters
    ----------
    tweets : iterable of token lists, one list per tweet.
    add_negtn_feat : bool — when True, append left/right negation-decay
        features for every token.

    Returns
    -------
    (features, unigrams_sorted, mostcommon): a list of per-tweet feature
    dicts, the vocabulary keys, and the 50 most frequent unigrams.
    """
    # Global unigram frequency distribution over every tweet.
    # (Dead locals `words_uni = words` / `INF = 0.0` and the commented-out
    # bigram/trigram lines from the original were removed.)
    unigrams_fd = nltk.FreqDist()
    for words in tweets:
        unigrams_fd.update(words)
    unigrams_sorted = unigrams_fd.keys()
    mostcommon = unigrams_fd.most_common(50)

    def get_word_features(words):
        # One binary 'has(word)' indicator per token.
        bag = {}
        words_uni = ['has(%s)' % ug for ug in words]
        for f in words_uni:
            bag[f] = 1
        print(bag)  # kept: the notebook relied on this debug output
        return bag

    # Tokens that flip polarity: explicit negation words, or an n't suffix.
    # Whitespace in the pattern is ignored under re.X.
    negtn_regex = re.compile(r"""(?:
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint
        )$
    )
    |
    n't
    """, re.X)

    def get_negation_features(words):
        # left[i]/right[i] start at 1.0 on a negation token and decay by
        # 0.1 per step away from it on that side.
        negtn = [bool(negtn_regex.search(w)) for w in words]
        left = [0.0] * len(words)
        prev = 0.0
        for i in range(0, len(words)):
            if negtn[i]:
                prev = 1.0
            left[i] = prev
            prev = max(0.0, prev - 0.1)
        right = [0.0] * len(words)
        prev = 0.0
        for i in reversed(range(0, len(words))):
            if negtn[i]:
                prev = 1.0
            right[i] = prev
            prev = max(0.0, prev - 0.1)
        return dict(zip(
            ['neg_l(' + w + ')' for w in words] + ['neg_r(' + w + ')' for w in words],
            left + right))

    def extract_features(words):
        features = {}
        features.update(get_word_features(words))
        if add_negtn_feat:
            features.update(get_negation_features(words))
        return features

    a = []
    for sentence in tweets:
        a.append(extract_features(sentence))
    return a, unigrams_sorted, mostcommon
# Smoke-test the feature builder on a single toy tweet.
sample_tweets = ['good good good not good'.split()]
features, unigrams_sorted, mostcommon = unigrams(sample_tweets, True)
print(features)
print(unigrams_sorted)
print(mostcommon)
def text(new_sentence=()):
    """Join tokenised sentences back into one string.

    BUG FIX: the original read an undefined global ``new_sentence`` and
    printed/returned the ``string`` *module* instead of the accumulated
    text ``strg``. The sentences are now a parameter (default empty, so a
    bare ``text()`` call returns '' instead of raising NameError).

    Parameters
    ----------
    new_sentence : iterable of token lists.

    Returns
    -------
    str — each sentence's tokens joined by spaces, sentences concatenated.
    """
    strg = ''
    for sentence in new_sentence:
        strg += " ".join(sentence)
    print(strg)
    return strg
def bag_of_words(new_sentence=()):
    """Build a presence dict mapping every distinct token to True.

    BUG FIX: the original read an undefined global ``new_sentence``; the
    sentences are now a parameter (default empty), which also matches how
    ``bigram`` already tried to call this function with an argument.

    Parameters
    ----------
    new_sentence : iterable of token lists.

    Returns
    -------
    dict mapping each distinct token across all sentences to True.
    """
    feature = {}
    for sentence in new_sentence:
        feature.update(dict([(word, True) for word in sentence]))
    return feature
def bigram(words, score_fn=BigramAssocMeasures.chi_sq, n=1000):
    """Return a presence dict over the top-n bigram collocations in *words*.

    Parameters
    ----------
    words : flat sequence of tokens.
    score_fn : association measure used to rank candidate bigrams.
    n : number of top-scoring bigrams to keep.

    Returns
    -------
    dict mapping each concatenated bigram string (u+v) to True.
    """
    bigram_finder = BigramCollocationFinder.from_words(words)
    top_bigrams = bigram_finder.nbest(score_fn, n)
    # BUG FIX: the original called bag_of_words(newBigrams), but
    # bag_of_words took no arguments, so the call could never succeed.
    # Build the presence bag directly instead.
    return {u + v: True for (u, v) in top_bigrams}
def build_features():
    # NOTE(review): unfinished stub — both feature-construction attempts are
    # commented out, so `feature` is undefined here and calling this function
    # raises NameError. Kept as-is pending a decision on which feature set
    # (bag-of-words vs bigrams) to use.
    #feature = bag_of_words()
    #feature = bigram(text(),score_fn=BigramAssocMeasures.chi_sq,n=500)
    print(feature)
# The results are hard to read.
# I tried these functions to extract the features piece by piece, but I still do not know how to combine them
# and fit the model.
{'has(good)': 1, 'has(not)': 1}
[{'has(good)': 1, 'has(not)': 1, 'neg_l(good)': 0.9, 'neg_l(not)': 1.0, 'neg_r(good)': 0.0, 'neg_r(not)': 1.0}]
dict_keys(['good', 'not'])
[('good', 4), ('not', 1)]
from sklearn.model_selection import train_test_split
# Stratified 80/20 split: train and validation keep the same label
# proportions as the full dataset.
train, valid = train_test_split(train_data, test_size=0.2, random_state=0,
                                stratify=train_data.Sentiment.values)
print("train shape : ", train.shape)
print("valid shape : ", valid.shape)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
# Bag-of-words counts; decode_error='replace' tolerates stray bytes.
stop = list(stopwords.words('english'))
vectorizer = CountVectorizer(decode_error='replace', stop_words=stop)
X_train = vectorizer.fit_transform(train.Tweet.values)
X_valid = vectorizer.transform(valid.Tweet.values)
y_train = train.Sentiment.values
y_valid = valid.Sentiment.values
print("X_train.shape : ", X_train.shape)
# BUG FIX: this line printed X_valid's shape under the "X_train" label.
print("X_valid.shape : ", X_valid.shape)
print("y_train.shape : ", y_train.shape)
print("y_valid.shape : ", y_valid.shape)
train shape : (32925, 4) valid shape : (8232, 4) X_train.shape : (32925, 52991) X_train.shape : (8232, 52991) y_train.shape : (32925,) y_valid.shape : (8232,)
# Naive Bayes Classifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
naiveByes_clf = MultinomialNB()
naiveByes_clf.fit(X_train, y_train)
NB_prediction = naiveByes_clf.predict(X_valid)
NB_accuracy = accuracy_score(y_valid, NB_prediction)
print("training accuracy Score : ", naiveByes_clf.score(X_train, y_train))
print("Validation accuracy Score : ", NB_accuracy)
# BUG FIX: classification_report expects (y_true, y_pred); the original
# passed the predictions first, swapping precision and recall per class.
print(classification_report(y_valid, NB_prediction))
# Stochastic Gradient Descent-SGD Classifier (hinge loss = linear SVM).
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', random_state=0)
sgd_clf.fit(X_train, y_train)
sgd_prediction = sgd_clf.predict(X_valid)
sgd_accuracy = accuracy_score(y_valid, sgd_prediction)
print("Training accuracy Score : ", sgd_clf.score(X_train, y_train))
print("Validation accuracy Score : ", sgd_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, sgd_prediction))
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_prediction = rf_clf.predict(X_valid)
rf_accuracy = accuracy_score(y_valid, rf_prediction)
print("Training accuracy Score : ", rf_clf.score(X_train, y_train))
print("Validation accuracy Score : ", rf_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, rf_prediction))
# Extreme Gradient Boosting
import xgboost as xgb
xgboost_clf = xgb.XGBClassifier()
xgboost_clf.fit(X_train, y_train)
xgb_prediction = xgboost_clf.predict(X_valid)
xgb_accuracy = accuracy_score(y_valid, xgb_prediction)
print("Training accuracy Score : ", xgboost_clf.score(X_train, y_train))
print("Validation accuracy Score : ", xgb_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, xgb_prediction))
# Support vector machine (RBF kernel by default).
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
svc_prediction = svc.predict(X_valid)
svc_accuracy = accuracy_score(y_valid, svc_prediction)
print("Training accuracy Score : ", svc.score(X_train, y_train))
print("Validation accuracy Score : ", svc_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, svc_prediction))
# Logistic Regression. max_iter raised from the default 100 because the
# original run emitted a ConvergenceWarning ("TOTAL NO. of ITERATIONS
# REACHED LIMIT") on this data.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
logreg_prediction = logreg.predict(X_valid)
logreg_accuracy = accuracy_score(y_valid, logreg_prediction)
print("Training accuracy Score : ", logreg.score(X_train, y_train))
print("Validation accuracy Score : ", logreg_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, logreg_prediction))
# Catboost Algorithm
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
clf2 = CatBoostClassifier()
# Evaluate on the validation fold during training; suppress per-iteration logs.
clf2.fit(X_train, y_train,
         eval_set=(X_valid, y_valid),
         verbose=False
         )
print('CatBoost model is fitted: ' + str(clf2.is_fitted()))
print('CatBoost model parameters:')
print(clf2.get_params())
catboost_prediction = clf2.predict(X_valid)
catboost_accuracy = accuracy_score(y_valid, catboost_prediction)
print("Training accuracy Score : ", clf2.score(X_train, y_train))
print("Validation accuracy Score : ", catboost_accuracy)
# BUG FIX: (y_true, y_pred) argument order for classification_report.
print(classification_report(y_valid, catboost_prediction))
training accuracy Score : 0.7523462414578588
Validation accuracy Score : 0.4597910592808552
precision recall f1-score support
Extremely Negative 0.29 0.62 0.40 508
Extremely Positive 0.35 0.62 0.44 744
Negative 0.54 0.42 0.47 2528
Neutral 0.27 0.71 0.39 587
Positive 0.67 0.39 0.50 3865
accuracy 0.46 8232
macro avg 0.42 0.55 0.44 8232
weighted avg 0.55 0.46 0.47 8232
Training accuracy Score : 0.9080334092634776
Validation accuracy Score : 0.5649902818270165
precision recall f1-score support
Extremely Negative 0.66 0.59 0.62 1223
Extremely Positive 0.65 0.64 0.65 1340
Negative 0.45 0.50 0.47 1766
Neutral 0.75 0.59 0.66 1968
Positive 0.45 0.53 0.48 1935
accuracy 0.56 8232
macro avg 0.59 0.57 0.58 8232
weighted avg 0.58 0.56 0.57 8232
Training accuracy Score : 0.9998177676537585
Validation accuracy Score : 0.5516277939747327
precision recall f1-score support
Extremely Negative 0.35 0.70 0.47 553
Extremely Positive 0.31 0.74 0.43 556
Negative 0.53 0.50 0.51 2073
Neutral 0.80 0.59 0.68 2079
Positive 0.64 0.49 0.56 2971
accuracy 0.55 8232
macro avg 0.53 0.60 0.53 8232
weighted avg 0.61 0.55 0.56 8232
[10:07:21] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
C:\Users\cheny\anaconda3\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
Training accuracy Score : 0.681366742596811
Validation accuracy Score : 0.5754373177842566
precision recall f1-score support
Extremely Negative 0.50 0.65 0.56 842
Extremely Positive 0.54 0.72 0.62 997
Negative 0.46 0.55 0.50 1657
Neutral 0.80 0.54 0.64 2316
Positive 0.58 0.55 0.56 2420
accuracy 0.58 8232
macro avg 0.58 0.60 0.58 8232
weighted avg 0.61 0.58 0.58 8232
Training accuracy Score : 0.9162338648443432
Validation accuracy Score : 0.5881924198250729
precision recall f1-score support
Extremely Negative 0.44 0.70 0.54 684
Extremely Positive 0.50 0.77 0.61 856
Negative 0.55 0.53 0.54 2057
Neutral 0.72 0.61 0.66 1800
Positive 0.66 0.53 0.59 2835
accuracy 0.59 8232
macro avg 0.57 0.63 0.59 8232
weighted avg 0.61 0.59 0.59 8232
C:\Users\cheny\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Training accuracy Score : 0.9553227031131359
Validation accuracy Score : 0.6000971817298348
precision recall f1-score support
Extremely Negative 0.59 0.64 0.62 1005
Extremely Positive 0.61 0.70 0.65 1150
Negative 0.52 0.54 0.53 1904
Neutral 0.71 0.63 0.67 1743
Positive 0.60 0.56 0.58 2430
accuracy 0.60 8232
macro avg 0.60 0.61 0.61 8232
weighted avg 0.60 0.60 0.60 8232
CatBoost model is fitted: True
CatBoost model parameters:
{}
Training accuracy Score : 0.6543356112376614
Validation accuracy Score : 0.6100583090379009
precision recall f1-score support
Extremely Negative 0.50 0.69 0.58 792
Extremely Positive 0.55 0.76 0.64 969
Negative 0.52 0.57 0.54 1797
Neutral 0.83 0.58 0.68 2201
Positive 0.63 0.58 0.60 2473
accuracy 0.61 8232
macro avg 0.61 0.64 0.61 8232
weighted avg 0.64 0.61 0.61 8232
# Evaluation: rank all five-class models by validation accuracy.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression',
              'Random Forest', 'Naive Bayes',
              'Stochastic Gradient Descent', 'XGBoost', 'CatBoost'],
    'Test accuracy': [svc_accuracy, logreg_accuracy,
                      rf_accuracy, NB_accuracy,
                      sgd_accuracy, xgb_accuracy, catboost_accuracy]})
# sort_values returns a new frame; print it so the ranking is visible when
# run as a script (the bare expression was previously discarded). The
# misspelled label 'Decent' was also corrected to 'Descent'.
print(models.sort_values(by='Test accuracy', ascending=False))
| Model | Test accuracy | |
|---|---|---|
| 6 | CatBoost | 0.610058 |
| 1 | Logistic Regression | 0.600097 |
| 0 | Support Vector Machines | 0.588192 |
| 5 | XGBoost | 0.575437 |
| 4 | Stochastic Gradient Decent | 0.564990 |
| 2 | Random Forest | 0.556365 |
| 3 | Naive Bayes | 0.459791 |
# Collapse the five labels to binary sentiment: positive-leaning
# (incl. Neutral) -> 1, negative -> 0. Work on an explicit copy so the
# assignments neither raise SettingWithCopyWarning (seen in the original
# run) nor write through to train_data.
cb = train_data[['Tweet', 'Sentiment']].copy()
cb["Sentiment"] = cb["Sentiment"].replace({
    'Positive': 1,
    'Extremely Positive': 1,
    'Neutral': 1,
    'Negative': 0,
    'Extremely Negative': 0,
})
X = cb.drop('Sentiment', axis=1)
y = cb.Sentiment
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
# BUG FIX: the original iterated each *character* of the tweet string and
# discarded the result. Filter whole words and keep the filtered text.
cb['Tweet'] = cb['Tweet'].apply(
    lambda x: ' '.join(w for w in x.split() if w not in stop))
from sklearn.model_selection import train_test_split
# Stratified 80/20 split on the binary labels: both subsets keep the same
# 0/1 proportions as the full dataset.
train, valid = train_test_split(cb, test_size=0.2, random_state=0,
                                stratify=cb.Sentiment.values)
print("train shape : ", train.shape)
print("valid shape : ", valid.shape)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
stop = list(stopwords.words('english'))
vectorizer = CountVectorizer(decode_error='replace', stop_words=stop)
X_train = vectorizer.fit_transform(train.Tweet.values)
X_valid = vectorizer.transform(valid.Tweet.values)
y_train = train.Sentiment.values
y_valid = valid.Sentiment.values
print("X_train.shape : ", X_train.shape)
# BUG FIX: this line printed X_valid's shape under the "X_train" label.
print("X_valid.shape : ", X_valid.shape)
print("y_train.shape : ", y_train.shape)
print("y_valid.shape : ", y_valid.shape)
# Binary-label models. BUG FIX applied throughout this section:
# classification_report expects (y_true, y_pred); every call below
# previously passed the predictions first, which swaps the per-class
# precision and recall columns in the printed reports.

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
naiveByes_clf = MultinomialNB()
naiveByes_clf.fit(X_train, y_train)
NB_prediction = naiveByes_clf.predict(X_valid)
NB_accuracy = accuracy_score(y_valid, NB_prediction)
print("training accuracy Score : ", naiveByes_clf.score(X_train, y_train))
print("Validation accuracy Score : ", NB_accuracy)
print(classification_report(y_valid, NB_prediction))
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_prediction = rf_clf.predict(X_valid)
rf_accuracy = accuracy_score(y_valid, rf_prediction)
print("Training accuracy Score : ", rf_clf.score(X_train, y_train))
print("Validation accuracy Score : ", rf_accuracy)
print(classification_report(y_valid, rf_prediction))
# Logistic Regression. max_iter raised because the original run warned
# that lbfgs stopped at the iteration limit without converging.
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
logreg_prediction = logreg.predict(X_valid)
logreg_accuracy = accuracy_score(y_valid, logreg_prediction)
print("Training accuracy Score : ", logreg.score(X_train, y_train))
print("Validation accuracy Score : ", logreg_accuracy)
print(classification_report(y_valid, logreg_prediction))
# Catboost Algorithm
clf2 = CatBoostClassifier()
clf2.fit(X_train, y_train,
         eval_set=(X_valid, y_valid),
         verbose=False
         )
print('CatBoost model is fitted: ' + str(clf2.is_fitted()))
print('CatBoost model parameters:')
print(clf2.get_params())
catboost_prediction = clf2.predict(X_valid)
catboost_accuracy = accuracy_score(y_valid, catboost_prediction)
print("Training accuracy Score : ", clf2.score(X_train, y_train))
print("Validation accuracy Score : ", catboost_accuracy)
print(classification_report(y_valid, catboost_prediction))
# Extreme Gradient Boosting
import xgboost as xgb
xgboost_clf = xgb.XGBClassifier()
xgboost_clf.fit(X_train, y_train)
xgb_prediction = xgboost_clf.predict(X_valid)
xgb_accuracy = accuracy_score(y_valid, xgb_prediction)
print("Training accuracy Score : ", xgboost_clf.score(X_train, y_train))
print("Validation accuracy Score : ", xgb_accuracy)
print(classification_report(y_valid, xgb_prediction))
# Support vector machine
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)
svc_prediction = svc.predict(X_valid)
svc_accuracy = accuracy_score(y_valid, svc_prediction)
print("Training accuracy Score : ", svc.score(X_train, y_train))
print("Validation accuracy Score : ", svc_accuracy)
print(classification_report(y_valid, svc_prediction))
# Stochastic Gradient Descent-SGD Classifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', random_state=0)
sgd_clf.fit(X_train, y_train)
sgd_prediction = sgd_clf.predict(X_valid)
sgd_accuracy = accuracy_score(y_valid, sgd_prediction)
print("Training accuracy Score : ", sgd_clf.score(X_train, y_train))
print("Validation accuracy Score : ", sgd_accuracy)
print(classification_report(y_valid, sgd_prediction))
<ipython-input-20-481234dba865>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-20-481234dba865>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-20-481234dba865>:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-20-481234dba865>:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-20-481234dba865>:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy [nltk_data] Downloading package stopwords to [nltk_data] C:\Users\cheny\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
train shape : (32925, 2)
valid shape : (8232, 2)
X_train.shape : (32925, 53194)
X_train.shape : (8232, 53194)
y_train.shape : (32925,)
y_valid.shape : (8232,)
training accuracy Score : 0.8920577069096431
Validation accuracy Score : 0.7889941690962099
precision recall f1-score support
0 0.66 0.74 0.70 2745
1 0.86 0.81 0.84 5487
accuracy 0.79 8232
macro avg 0.76 0.78 0.77 8232
weighted avg 0.80 0.79 0.79 8232
Training accuracy Score : 0.9999392558845862
Validation accuracy Score : 0.8264091350826045
precision recall f1-score support
0 0.68 0.82 0.75 2559
1 0.91 0.83 0.87 5673
accuracy 0.83 8232
macro avg 0.80 0.83 0.81 8232
weighted avg 0.84 0.83 0.83 8232
C:\Users\cheny\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
Training accuracy Score : 0.9641002277904328
Validation accuracy Score : 0.8613945578231292
precision recall f1-score support
0 0.77 0.84 0.81 2811
1 0.92 0.87 0.89 5421
accuracy 0.86 8232
macro avg 0.84 0.86 0.85 8232
weighted avg 0.87 0.86 0.86 8232
CatBoost model is fitted: True
CatBoost model parameters:
{}
Training accuracy Score : 0.8743507972665148
Validation accuracy Score : 0.8439018464528668
precision recall f1-score support
0 0.70 0.85 0.77 2529
1 0.93 0.84 0.88 5703
accuracy 0.84 8232
macro avg 0.82 0.85 0.83 8232
weighted avg 0.86 0.84 0.85 8232
[10:24:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
C:\Users\cheny\anaconda3\lib\site-packages\xgboost\sklearn.py:1146: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
Training accuracy Score : 0.8381169324221716
Validation accuracy Score : 0.8170553935860059
precision recall f1-score support
0 0.62 0.85 0.72 2272
1 0.93 0.81 0.86 5960
accuracy 0.82 8232
macro avg 0.78 0.83 0.79 8232
weighted avg 0.85 0.82 0.82 8232
Training accuracy Score : 0.9621564160971906
Validation accuracy Score : 0.831754130223518
precision recall f1-score support
0 0.66 0.85 0.75 2389
1 0.93 0.82 0.87 5843
accuracy 0.83 8232
macro avg 0.80 0.84 0.81 8232
weighted avg 0.85 0.83 0.84 8232
Training accuracy Score : 0.9607593014426727
Validation accuracy Score : 0.8604227405247813
precision recall f1-score support
0 0.78 0.84 0.81 2851
1 0.91 0.87 0.89 5381
accuracy 0.86 8232
macro avg 0.84 0.86 0.85 8232
weighted avg 0.86 0.86 0.86 8232
# Rank the binary-label models by validation accuracy.
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression',
              'Random Forest', 'Naive Bayes',
              'Stochastic Gradient Descent', 'XGBoost', 'CatBoost'],
    'Test accuracy': [svc_accuracy, logreg_accuracy,
                      rf_accuracy, NB_accuracy,
                      sgd_accuracy, xgb_accuracy, catboost_accuracy]})
# sort_values returns a new frame; print it so the ranking is visible when
# run as a script (the bare expression was previously discarded). The
# misspelled label 'Decent' was also corrected to 'Descent'.
print(models.sort_values(by='Test accuracy', ascending=False))
| Model | Test accuracy | |
|---|---|---|
| 1 | Logistic Regression | 0.861395 |
| 4 | Stochastic Gradient Decent | 0.860423 |
| 6 | CatBoost | 0.843902 |
| 0 | Support Vector Machines | 0.831754 |
| 2 | Random Forest | 0.826409 |
| 5 | XGBoost | 0.817055 |
| 3 | Naive Bayes | 0.788994 |